In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
from warnings import filterwarnings 
filterwarnings("ignore")
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
In [2]:
data=pd.read_csv("C:\\Users\\laxma\\Downloads\\placement-dataset.csv")
data
Out[2]:
city cgpa iq placement
0 New York 6.8 123.0 1
1 Los Angeles 5.9 106.0 0
2 Chicago NaN 121.0 0
3 New York 7.4 132.0 1
4 Los Angeles 5.8 142.0 0
... ... ... ... ...
95 Chicago 4.3 200.0 0
96 New York 4.4 42.0 0
97 Los Angeles 6.7 182.0 1
98 Chicago 6.3 103.0 1
99 New York 6.2 113.0 1

100 rows × 4 columns

In [3]:
data.head()
Out[3]:
city cgpa iq placement
0 New York 6.8 123.0 1
1 Los Angeles 5.9 106.0 0
2 Chicago NaN 121.0 0
3 New York 7.4 132.0 1
4 Los Angeles 5.8 142.0 0
In [4]:
data.tail()
Out[4]:
city cgpa iq placement
95 Chicago 4.3 200.0 0
96 New York 4.4 42.0 0
97 Los Angeles 6.7 182.0 1
98 Chicago 6.3 103.0 1
99 New York 6.2 113.0 1
In [5]:
data.describe()
Out[5]:
cgpa iq placement
count 92.000000 96.000000 100.000000
mean 5.965217 117.916667 0.460000
std 1.164911 46.913508 0.500908
min 3.300000 1.000000 0.000000
25% 5.000000 90.000000 0.000000
50% 6.000000 122.000000 0.000000
75% 6.825000 146.750000 1.000000
max 8.500000 233.000000 1.000000
In [6]:
data.describe
Out[6]:
<bound method NDFrame.describe of            city  cgpa     iq  placement
0      New York   6.8  123.0          1
1   Los Angeles   5.9  106.0          0
2       Chicago   NaN  121.0          0
3      New York   7.4  132.0          1
4   Los Angeles   5.8  142.0          0
..          ...   ...    ...        ...
95      Chicago   4.3  200.0          0
96     New York   4.4   42.0          0
97  Los Angeles   6.7  182.0          1
98      Chicago   6.3  103.0          1
99     New York   6.2  113.0          1

[100 rows x 4 columns]>
In [7]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   city       100 non-null    object 
 1   cgpa       92 non-null     float64
 2   iq         96 non-null     float64
 3   placement  100 non-null    int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 3.2+ KB
In [8]:
data.isnull().sum()
Out[8]:
city         0
cgpa         8
iq           4
placement    0
dtype: int64
In [9]:
data=data.dropna()
data
Out[9]:
city cgpa iq placement
0 New York 6.8 123.0 1
1 Los Angeles 5.9 106.0 0
3 New York 7.4 132.0 1
4 Los Angeles 5.8 142.0 0
5 Chicago 7.1 48.0 1
... ... ... ... ...
95 Chicago 4.3 200.0 0
96 New York 4.4 42.0 0
97 Los Angeles 6.7 182.0 1
98 Chicago 6.3 103.0 1
99 New York 6.2 113.0 1

88 rows × 4 columns

In [10]:
data.isnull().sum()
Out[10]:
city         0
cgpa         0
iq           0
placement    0
dtype: int64
In [11]:
data.shape
Out[11]:
(88, 4)
In [12]:
data.duplicated().sum()
Out[12]:
0
In [13]:
data.columns
Out[13]:
Index(['city', 'cgpa', 'iq', 'placement'], dtype='object')
In [14]:
#VISUALIZATION
In [15]:
plt.bar(data['placement'],data['cgpa'])
plt.xticks(rotation=90)
plt.show()
In [16]:
fig=px.bar(data,x='iq',y='city',color='city')
fig.show()
In [17]:
plt.figure(figsize=(10,4))
sns.countplot(x='placement', data=data, color='r')
plt.show()
In [18]:
sns.lineplot(x='city', y='placement', data=data).set_title('placement with city')
Out[18]:
Text(0.5, 1.0, 'placement with city')
In [19]:
sns.barplot(data['placement'],data['iq'],color='r')
plt.xticks(rotation=90)
plt.show()
In [20]:
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data,x='cgpa', y='city')
plt.title('cgpa and city')
plt.xlabel('cgpa')
plt.ylabel('city')
plt.show()
In [21]:
sns.displot(data["city"])
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x20132f70f40>
In [22]:
sns.boxplot(x='cgpa',y='iq',data=data)
plt.xticks(rotation=90)
Out[22]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38]),
 [Text(0, 0, '3.3'),
  Text(1, 0, '3.5'),
  Text(2, 0, '3.9'),
  Text(3, 0, '4.0'),
  Text(4, 0, '4.3'),
  Text(5, 0, '4.4'),
  Text(6, 0, '4.6'),
  Text(7, 0, '4.7'),
  Text(8, 0, '4.8'),
  Text(9, 0, '4.9'),
  Text(10, 0, '5.0'),
  Text(11, 0, '5.1'),
  Text(12, 0, '5.2'),
  Text(13, 0, '5.3'),
  Text(14, 0, '5.4'),
  Text(15, 0, '5.7'),
  Text(16, 0, '5.8'),
  Text(17, 0, '5.9'),
  Text(18, 0, '6.0'),
  Text(19, 0, '6.1'),
  Text(20, 0, '6.2'),
  Text(21, 0, '6.3'),
  Text(22, 0, '6.4'),
  Text(23, 0, '6.5'),
  Text(24, 0, '6.6'),
  Text(25, 0, '6.7'),
  Text(26, 0, '6.8'),
  Text(27, 0, '6.9'),
  Text(28, 0, '7.0'),
  Text(29, 0, '7.1'),
  Text(30, 0, '7.3'),
  Text(31, 0, '7.4'),
  Text(32, 0, '7.5'),
  Text(33, 0, '7.6'),
  Text(34, 0, '7.8'),
  Text(35, 0, '8.0'),
  Text(36, 0, '8.1'),
  Text(37, 0, '8.3'),
  Text(38, 0, '8.5')])
In [23]:
sns.countplot(data=data, x='cgpa',color='yellowgreen')
plt.xticks(rotation=90)
Out[23]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38]),
 [Text(0, 0, '3.3'),
  Text(1, 0, '3.5'),
  Text(2, 0, '3.9'),
  Text(3, 0, '4.0'),
  Text(4, 0, '4.3'),
  Text(5, 0, '4.4'),
  Text(6, 0, '4.6'),
  Text(7, 0, '4.7'),
  Text(8, 0, '4.8'),
  Text(9, 0, '4.9'),
  Text(10, 0, '5.0'),
  Text(11, 0, '5.1'),
  Text(12, 0, '5.2'),
  Text(13, 0, '5.3'),
  Text(14, 0, '5.4'),
  Text(15, 0, '5.7'),
  Text(16, 0, '5.8'),
  Text(17, 0, '5.9'),
  Text(18, 0, '6.0'),
  Text(19, 0, '6.1'),
  Text(20, 0, '6.2'),
  Text(21, 0, '6.3'),
  Text(22, 0, '6.4'),
  Text(23, 0, '6.5'),
  Text(24, 0, '6.6'),
  Text(25, 0, '6.7'),
  Text(26, 0, '6.8'),
  Text(27, 0, '6.9'),
  Text(28, 0, '7.0'),
  Text(29, 0, '7.1'),
  Text(30, 0, '7.3'),
  Text(31, 0, '7.4'),
  Text(32, 0, '7.5'),
  Text(33, 0, '7.6'),
  Text(34, 0, '7.8'),
  Text(35, 0, '8.0'),
  Text(36, 0, '8.1'),
  Text(37, 0, '8.3'),
  Text(38, 0, '8.5')])
In [24]:
sns.histplot(data, x="cgpa", hue="placement", multiple="stack", bins = 50, kde=True)
Out[24]:
<AxesSubplot:xlabel='cgpa', ylabel='Count'>
In [25]:
#MODEL BUILDING
In [26]:
X = data.drop(['city'], axis=1)
y = data['placement']
In [27]:
X.head()
Out[27]:
cgpa iq placement
0 6.8 123.0 1
1 5.9 106.0 0
3 7.4 132.0 1
4 5.8 142.0 0
5 7.1 48.0 1
In [28]:
y.head()
Out[28]:
0    1
1    0
3    1
4    0
5    1
Name: placement, dtype: int64
In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)
In [30]:
X_train.shape, X_test.shape
Out[30]:
((44, 3), (44, 3))
In [31]:
X_train.dtypes
Out[31]:
cgpa         float64
iq           float64
placement      int64
dtype: object
In [32]:
from sklearn.tree import DecisionTreeClassifier
DTree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)
DTree.fit(X_train, y_test)
Out[32]:
DecisionTreeClassifier(max_depth=3, random_state=0)
In [33]:
y_pred = DTree.predict(X_test)
y_pred
Out[33]:
array([1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1],
      dtype=int64)
In [34]:
from sklearn.metrics import accuracy_score
print('model accuracy score with criterion gini index: {0:04f}'. format (accuracy_score(y_test, y_pred)))
model accuracy score with criterion gini index: 0.318182
In [35]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix
Out[35]:
array([[11, 12],
       [18,  3]], dtype=int64)
In [36]:
plt.figure(figsize=(10,8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='crest', cbar=False)
Out[36]:
<AxesSubplot:>
In [37]:
from sklearn.metrics import classification_report
class_report = classification_report(y_test, y_pred)
print(class_report)
              precision    recall  f1-score   support

           0       0.38      0.48      0.42        23
           1       0.20      0.14      0.17        21

    accuracy                           0.32        44
   macro avg       0.29      0.31      0.29        44
weighted avg       0.29      0.32      0.30        44

In [38]:
plt.figure(figsize=(8,6))
from sklearn import tree
tree.plot_tree(DTree.fit(X_train, y_train))
Out[38]:
[Text(0.5, 0.75, 'X[2] <= 0.5\ngini = 0.499\nsamples = 44\nvalue = [21, 23]'),
 Text(0.25, 0.25, 'gini = 0.0\nsamples = 21\nvalue = [21, 0]'),
 Text(0.75, 0.25, 'gini = 0.0\nsamples = 23\nvalue = [0, 23]')]
In [ ]:
 
In [ ]: